/*******************************************************************************
Project: The Impact of Dodd-Frank and the Huawei Shock on DRC Tin Exports
Authors: Haruka Nagamori and Kazuhiko Nishimura
Script: Data_Integration.do
Purpose: Clean and merge mirror trade data (DRC & COG), exchange rates,
         LME tin prices, and Myanmar tin exports to create a monthly panel dataset.
Date: March 2026
*******************************************************************************/

clear all
set more off

* ==============================================================================
* 1. Trade Data Integration (DRC and COG)
* ==============================================================================

* Import DRC data
import delimited "drc_tin_imports_2009-2024_monthly.csv", clear varnames(1)
tempfile drc_data
save `drc_data', replace

* Import COG data and append DRC data
import delimited "cog_tin_imports_2009-2024_monthly.csv", clear varnames(1)
append using `drc_data'

* Clean quantity data (handle "NA")
destring quantity, replace ignore("NA") force

* Reviewer's suggestion: Treat COG exports as DRC exports
replace partnercode = "COD"
replace partnerdesc = "Dem. Rep. of the Congo"

* Aggregate duplicate reports (sum tradevalue and quantity for the same importer & month)
collapse (sum) tradevalue quantity (firstnm) partnercode partnerdesc reporterdesc, by(reportercode month_period)
replace quantity = . if quantity == 0

* Sort for subsequent merges
sort reportercode
tempfile main_trade
save `main_trade', replace


* ==============================================================================
* 2. Country-Currency Mapping
* ==============================================================================
import delimited "Country_Currency.csv", clear varnames(1) stringcols(_all)
rename countrycode reportercode
keep reportercode currencycode corrcurr
sort reportercode

* Merge with the main trade data
merge 1:m reportercode using `main_trade'
drop if _merge == 1 // Drop countries not in our trade data
drop _merge

rename currencycode repcurrcode
gen parcurrcode = "CDF"

sort month_period repcurrcode
save `main_trade', replace


* ==============================================================================
* 3. Exchange Rates Preparation & Merge (RFX, PFX, IVFX)
* ==============================================================================
import delimited "Exchange_Rates.csv", clear
drop time // drop unnecessary column
rename * rfx_*
rename rfx_month_period month_period

* Reshape to long format
reshape long rfx_, i(month_period) j(repcurrcode) string
replace repcurrcode = upper(repcurrcode)
rename rfx_ RFX

* Save base exchange rate dataset
sort month_period repcurrcode
tempfile exrates_base
save `exrates_base', replace

* --- Merge RFX ---
use `main_trade', clear
merge m:1 month_period repcurrcode using `exrates_base', gen(_merge_rfx)
drop if _merge_rfx == 2
drop _merge_rfx
save `main_trade', replace

* --- Merge PFX ---
use `exrates_base', clear
rename repcurrcode parcurrcode
rename RFX PFX
sort month_period parcurrcode
tempfile pfx_data
save `pfx_data', replace

use `main_trade', clear
merge m:1 month_period parcurrcode using `pfx_data', gen(_merge_pfx)
drop if _merge_pfx == 2
drop _merge_pfx
save `main_trade', replace

* --- Merge IVFX ---
use `exrates_base', clear
rename repcurrcode corrcurr
rename RFX IVFX
sort month_period corrcurr
tempfile ivfx_data
save `ivfx_data', replace

use `main_trade', clear
merge m:1 month_period corrcurr using `ivfx_data', gen(_merge_ivfx)
drop if _merge_ivfx == 2
drop _merge_ivfx
save `main_trade', replace


* ==============================================================================
* 4. LME Tin Prices (World Bank Pink Sheet)
* ==============================================================================
import excel "CMO-Historical-Data-Monthly.xlsx", sheet("Monthly Prices") clear
keep A BO
rename A month_period_str
rename BO LME_tin_price

* Drop non-data rows (keep only rows containing "M" like "2010M01")
drop if strpos(month_period_str, "M") == 0

* Convert to standard month_period format (e.g., 2010M01 -> 201001)
gen month_period = real(subinstr(month_period_str, "M", "", .))
destring LME_tin_price, replace force
drop month_period_str
drop if month_period == .

sort month_period
tempfile tin_price
save `tin_price', replace

* Merge LME Tin Price to main data
use `main_trade', clear
merge m:1 month_period using `tin_price', gen(_merge_tin)
drop if _merge_tin == 2
drop _merge_tin
rename LME_tin_price LME
save `main_trade', replace


* ==============================================================================
* 5. Myanmar (MMR) Tin Imports
* ==============================================================================
import delimited "mmr_tin_imports_2009-2024_monthly.csv", clear
keep month_period reportercode tradevalue
rename tradevalue fromMMR
sort month_period reportercode
tempfile mmr_data
save `mmr_data', replace

* Merge MMR data to main data
use `main_trade', clear
merge 1:1 month_period reportercode using `mmr_data', gen(_merge_mmr)
drop if _merge_mmr == 2
drop _merge_mmr

* Replace missing MMR imports with 0
replace fromMMR = 0 if fromMMR == .


* ==============================================================================
* 6. Panel Data Setup & Final Export
* ==============================================================================

* Create Stata monthly date variable for time series/panel operators (L., D., etc.)
gen year_num = floor(month_period / 100)
gen month_num = mod(month_period, 100)
gen mdate = ym(year_num, month_num)
format mdate %tm
drop year_num month_num

* Encode string reportercode into a numeric ID for panel setting
encode reportercode, gen(reporter_id)

* Set panel data structure
xtset reporter_id mdate

* Sort logically for the final dataset
order reporter_id reportercode mdate month_period partnercode partnerdesc tradevalue quantity repcurrcode parcurrcode corrcurr RFX PFX IVFX LME fromMMR
sort reporter_id mdate

* Export to CSV
export delimited "Data_Integrated.csv", replace

